Load data

clinical_data <-
  read_tsv(
  file = "../data/clinical_data_clean.tsv")
## Rows: 1904 Columns: 19
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (9): cancer_type_detailed, type_of_breast_surgery, cellularity, P50CL_Subtype, TGC_Subtype, her2_status, er_statu...
## dbl (10): age_at_diagnosis, chemotherapy, radio_therapy, hormone_therapy, lymph_nodes_examined_positive, neoplasm_hist...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_expression <-
  read_tsv(
  file = "../data/gene_expression.tsv")
## Rows: 1904 Columns: 489
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (489): brca1, brca2, palb2, pten, tp53, atm, cdh1, chek2, nbn, nf1, stk11, bard1, mlh1, msh2, msh6, pms2, epcam, r...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_expression_aug <-
  read_tsv(
  file = "../data/gene_expression_aug.tsv")
## Rows: 931056 Columns: 3
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene
## dbl (2): overall_survival, expr_level
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Gene expression analysis

In this analysis we would like to perform a PCA analysis on the genes to explore clusters, trends etc. Moreover, we would like to fit a linear regression model to each of the genes.

PCA

Fitting PCA to data

Fitting PCA to the gene expression data, centering the data and scaling it for normalization.

pca_fit <- 
  gene_expression |> 
  prcomp(center = TRUE,
         scale = TRUE)

Variance explained by each PC component

Utilizing the tidy function to extract the eigenvalues from each PC component, slicing the 10 first rows as these are the only interesting, and plotting each component and variance explained as a bar chart.

plot_variance <- 
  pca_fit |> 
  tidy(matrix = "eigenvalues") |> 
  slice(1:10) |> 
  ggplot(aes(PC, 
             percent)) +
  geom_col(fill = "blue", 
           alpha = 0.7) +
  scale_x_continuous(breaks = 1:10) +
  scale_y_continuous(labels = scales::percent_format())

plot_variance

Plotting PC1 vs PC2

Using augment function to add the clinical data variables to the existing data frame, and plotting the first and second component.

PCA_plot <- 
  pca_fit |> 
  augment(clinical_data) |> 
  ggplot(aes(.fittedPC1, 
             .fittedPC2)) + 
  geom_point(size = 0.5) +
  labs(x = "Principal Component 1",
       y = "Principal Component 2") +
  theme(legend.position="right")

Coloring the PCA according to clinical data

PCA_er_status <- 
  PCA_plot +
  aes(color = er_status)

PCA_her2_status <- 
  PCA_plot +
  aes(color = her2_status)

PCA_pr_status <-
  PCA_plot +
  aes(color = pr_status)

PCA_Chemotherapy <- 
  PCA_plot +
  aes(color = factor(chemotherapy)) +
  labs(color = "Chemotherapy")

PCA_Hormonetherapy <- 
  PCA_plot +
  aes(color = factor(hormone_therapy)) +
  labs(color = "Hormone therapy")

PCA_Radiotherapy <- 
  PCA_plot +
  aes(color = factor(radio_therapy)) +
  labs(color = "Radio therapy")

PCA_survival <- 
  PCA_plot +
  aes(color = factor(overall_survival)) +
  labs(color = "Overall Survival")

Using patchwork to combine the plots into two plots

plot_status <- 
  PCA_er_status + PCA_her2_status / PCA_pr_status
plot_status

plot_therapy <-
  PCA_Chemotherapy + PCA_Hormonetherapy / PCA_Radiotherapy
plot_therapy

PCA_survival

Linear regression

Data wrangle

Firstly, the data is grouped by gene and outcome nested into one variable.

gene_expression_aug <-
  gene_expression_aug |>
  group_by(gene) |>
  nest() |> 
  ungroup()

Fitting models

Letting R know we are working by gene

gene_expression_aug <-
  gene_expression_aug |> 
  group_by(gene)

Next, we will use the map() function to apply a linear regression model to each gene, with gene_expression vs. overall_survival outcome.

gene_expression_aug <- 
  gene_expression_aug |> 
  mutate(model_object = map(.x = data,
                   .f = ~lm(formula = expr_level ~ overall_survival,
                            data = .x)))

Tidying models

gene_expression_aug <-
  gene_expression_aug |> 
  mutate(model_object_tidy = map(model_object,
                                 ~ tidy(.x,
                                        conf.int = TRUE,
                                        conf.level = 0.95)))
gene_expression_aug |> 
  sample_n(1)

Next, we would like to un-nest the model_object_tidy to reveal the variables and values.

gene_expression_aug <-
  gene_expression_aug |> 
  unnest(model_object_tidy)

After un-nesting the data, we would like to only keep gene name, p.value, estimate and confidence level

gene_estimates <- 
  gene_expression_aug |> 
  filter(term == "overall_survival") |> 
  select(gene, 
         p.value, 
         estimate, 
         conf.low, 
         conf.high) |> 
  ungroup()

Next, we would like to evaluate if the gene is considered significant to death or not

gene_estimates <- 
  gene_estimates |> 
  mutate(
    q.value = p.adjust(p.value),
    is_significant = case_when(
      q.value < 0.05 ~ "yes",
      TRUE ~ "no"
    )
  )
gene_estimates_significant <-
  gene_estimates |> 
  filter(is_significant == "yes") |> 
  mutate(gene = fct_reorder(gene, estimate))

Significant genes associated with death of cancer

plot_signif_gene <- 
  gene_estimates_significant |> 
  ggplot(aes(x = estimate, 
             y = gene)) +
  geom_point(aes(x = estimate), 
             size = 2) + 
  geom_errorbarh(aes(xmin = conf.low, 
                     xmax = conf.high), 
                 height = 0.2) +
  geom_vline(xintercept = 0) +
  theme(plot.title = element_text(size = 20, 
                                  hjust = 0.5),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_text(size = 14),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16)) +
  labs(title = "Significant Up/Down regulated genes associated with death of cancer", 
       y = "Genes",
       x = "Estimate")

plot_signif_gene

Write plots

ggsave("06_PCA_variance.png", 
       path = "../doc/images/",
       plot = plot_variance,
       width = 8,
       height = 5)

ggsave("06_PCA_status.png", 
       path = "../doc/images/",
       plot = plot_status,
       width = 8,
       height = 5)

ggsave("06_PCA_therapy.png", 
       path = "../doc/images/",
       plot = plot_therapy,
       width = 8,
       height = 5)

ggsave("06_signif_genes.png", 
       path = "../doc/images/",
       plot = plot_signif_gene,
       width = 20,
       height = 20)

ggsave("06_PCA_survival.png", 
       path = "../doc/images/",
       plot = PCA_survival,
       width = 8,
       height = 5)